import textgrid
import os
import sys
import math

from tqdm import tqdm

# cmudict_keys = []
# with open("mfa/librispeech_mfa/cmu/cmudict.txt") as f:
#     for line in tqdm(f, desc="cmudict"):
#         key, value = line.rstrip().split(" ", maxsplit=1)
#         cmudict_keys.append(key)

# Split the dev-set utterance IDs into those flagged as containing
# out-of-vocabulary (OOV) words and those that are not, write the latter to
# data/dev_nov_utt.txt, and print summary statistics.

# Utterance IDs listed in the OOV file, one per line. The list is kept (rather
# than only a set) so that len(oov_utt) reports exactly the number of lines read.
with open("data/dev_oov_utt.txt", "r") as f:
    oov_utt = [line.rstrip() for line in f]

# Set view used only for membership tests: O(1) per lookup instead of scanning
# the whole list for every line of the text file.
oov_lookup = set(oov_utt)

nov_utt = []  # utterance IDs from the text file that are NOT in the OOV list
count = 0  # number of utterance lines seen in the text file

# Each line is expected to look like "<utt_id> <transcript>".
with open("dump/raw/dev/text") as f:
    for line in tqdm(f, desc="oov_utt"):
        stripped = line.rstrip()
        if not stripped:
            # Blank line: carries no utterance, so it is neither counted nor
            # classified.
            continue
        count += 1
        # partition() instead of split()+unpacking: a line holding only an
        # utterance ID (empty transcript) has no space to split on and would
        # otherwise raise ValueError.
        key = stripped.partition(" ")[0]
        if key not in oov_lookup:
            nov_utt.append(key)

with open("data/dev_nov_utt.txt", "w") as f:
    f.writelines(f"{utt}\n" for utt in nov_utt)

# Guard the ratio so an empty text file reports "nan" instead of crashing with
# ZeroDivisionError after the output file has already been written.
oov_rate = len(oov_utt) / count if count else float("nan")

print(f"oov count: {len(oov_utt)} / {count}")
print(f"nov count: {len(nov_utt)} / {count}")
print(f"oov rate: {oov_rate}")